# Install and load the necessary packages
if(!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, utils, data.table, lubridate, sjlabelled, reshape2)

# Distric-level vote shares (Erststimme) ----------------------------------

### 2013 --------------------------------------------------------------------

# Read the raw data file
res_2013 <- read.csv("raw_data/results_2013.csv", sep = ";", skip = 5, header = TRUE, dec = ",", encoding = "latin1")

# Process votes for parties
res_2013_party <- res_2013 %>%
  # Select varaibles indicating vote shares of parties (Erststimme)  as well as electoral district IDs
  select(-starts_with("X."), -c(Gebiet, gehört, X, Wähler, Ungültige, Wahlberechtigte)) %>%
  # Remove rows that just indicate that these are first votes
  filter(!is.na(Nr), Nr < 300) %>%
  # Transfrom variables into numerics
  mutate_all(as.numeric)

# Select vote shares of other parties (other than the six major parties in Germany, including CDU/CSU, SPD, FDP, Greens, The Left, AfD)
others <- colnames(res_2013_party)[!(colnames(res_2013_party)%in% c("Nr", "Gültige", "CDU", "SPD", "FDP", "DIE.LINKE", "GRÜNE", "CSU", "AfD"))]

# Total votes for other parties
res_2013_party$others <-  rowSums(res_2013_party[,others], na.rm = TRUE)


# Votes shares for major parties
results_2013 <- res_2013_party %>% 
  # Make CDU and CSU votes as one variable
  mutate(CDU = dplyr::if_else(is.na(CDU), CSU, CDU)) %>% 
  # Select only district id, total votes, and number of votes for the six major political parties in Germany
  select(Nr, Gültige, CDU, SPD, FDP, DIE.LINKE, GRÜNE, AfD) %>% 
  
  # Calculate vote shares of parties
  mutate_at(vars(-c(Nr, Gültige)), funs(./Gültige * 100)) %>% 
  # Remove "total votes" variable
  select(-Gültige) %>% 
  # Rename variables
  rename(cdu = CDU,
         spd = SPD,
         fdp = FDP,
         left = DIE.LINKE,
         green = GRÜNE,
         afd = AfD, 
         wahlkreis = Nr) %>% 
  # Replace NA values by 0
  replace(is.na(.), 0)

# From wide to long format
res_2013_party <- reshape2::melt(results_2013, id.vars = c("wahlkreis"), variable.name = "party_name") %>% 
  rename(vote_share = value) %>% 
  # Create year variable
  mutate(year = 2013) %>% 
  # Add numeric party identifiers (necessary for merging election results with vote choices of survey respondents)
  mutate(party = dplyr::recode(party_name,
                               "cdu" = 1,
                               "spd" = 4,
                               "fdp" = 5,
                               "green" = 6,
                               "left" = 7,
                               "afd" = 322
  )) %>% 
  
  # Reorder variables
  select(year, wahlkreis, party, party_name, vote_share)




### 2017 --------------------------------------------------------------------

# Read the raw data file
res_2017 <- read.csv("raw_data/results_2017.csv", sep = ";", skip = 9, header = TRUE, dec = ",", encoding = "latin1")

# Process votes for parties
res_2017_party <- res_2017 %>% 
  # Filter rows that contain information on parties' votes shares in electoral districts (Erststimme) 
  filter(Gebietsart == "Wahlkreis", Gruppenart == "Partei", Stimme == 1) %>%
  # Create numeric party identifier
  mutate(party = dplyr::recode(Gruppenname,
                               "CDU" = 1,
                               "CSU" = 1,
                               "SPD" = 4,
                               "FDP" = 5,
                               "GRÃœNE" = 6,
                               "DIE LINKE" = 7,
                               "AfD" = 322,
                               # Others
                               .default = NA_real_
  )) %>%
  
  # Remove vote shares of other parties
  filter(!is.na(party)) %>% 
  
  # Rename variables
  rename(wahlkreis = Gebietsnummer,
         vote_share = Prozent
  ) %>% 
  # Select relevant variables (electoral district, party, vote_share)
  select(wahlkreis, party, vote_share) %>% 
  # If NA vote share because party did not run -> set to 0
  mutate(vote_share = dplyr::if_else(is.na(vote_share), 0, vote_share)) %>% 
  
  # Create year variable
  mutate(year = 2017) %>%
  # Create party name variable based on numeric party id
  mutate(party_name = dplyr::recode(party,
                                    "1" = "cdu",
                                    "4" = "spd",
                                    "5" = "fdp",
                                    "6" = "green",
                                    "7" = "left",
                                    "322" = "afd"
  )) %>% 
  
  # Reorder variables
  select(year, wahlkreis, party, party_name, vote_share)




### 2021 --------------------------------------------------------------------

# Read the raw data file
res_2021 <- read.csv("raw_data/results_2021.csv", sep = ";", skip = 9, header = TRUE, dec = ",", encoding = "latin1")

# Process votes for parties
res_2021_party <- res_2021 %>% 
  # Filter rows that contain information on parties' votes shares in electoral districts (Erststimme) 
  filter(Gebietsart == "Wahlkreis", Gruppenart == "Partei", Stimme == 1) %>%
  # Create numeric party identifier
  mutate(party = dplyr::recode(Gruppenname,
                               "CDU" = 1,
                               "CSU" = 1,
                               "SPD" = 4,
                               "FDP" = 5,
                               "GRÃœNE" = 6,
                               "DIE LINKE" = 7,
                               "AfD" = 322,
                               .default = NA_real_
  )) %>%
  
  # Remove vote shares of other parties
  filter(!is.na(party)) %>% 
  
  # Rename variables
  rename(wahlkreis = Gebietsnummer,
         vote_share = Prozent
  ) %>% 
  # Select relevant variables (electoral district, party, vote_share)
  select(wahlkreis, party, vote_share) %>% 
  # If NA vote share because party did not run -> set to 0
  mutate(vote_share = dplyr::if_else(is.na(vote_share), 0, vote_share)) %>% 
  
  # Create year variable
  mutate(year = 2021) %>%
  # Create party name variable based on numeric party id
  mutate(party_name = dplyr::recode(party,
                                    "1" = "cdu",
                                    "4" = "spd",
                                    "5" = "fdp",
                                    "6" = "green",
                                    "7" = "left",
                                    "322" = "afd"
  )) %>% 
  
  # Reorder variables
  select(year, wahlkreis, party, party_name, vote_share)







### Merge the data sets of 2009, 2013, 2017 and 2021 and determine first, second, and third candidate in each district-election ---------

# Bind different frames of official election results from the three elections together
official_results <- rbind(res_2013_party, res_2017_party, res_2021_party)

# Information on the winning party 
winner <- official_results %>% 
  group_by(wahlkreis, year) %>% 
  arrange(desc(vote_share)) %>%
  slice(1) %>% 
  rename(winner = party,
         winner_name = party_name,
         winner_vote_share = vote_share)

# Information on the party in second place
second <- official_results %>% 
  group_by(wahlkreis, year) %>% 
  arrange(desc(vote_share)) %>%
  slice(2) %>% 
  rename(second = party,
         second_name = party_name,
         second_vote_share = vote_share)


# Combine information on first and second party
winner_second <- left_join(winner, second, by = c("wahlkreis", "year"))

# Join this together with election results for all parties
districts_results <- left_join(official_results, winner_second, by = c("wahlkreis", "year")) %>% 
  
  # Number of the six major political parties that ran in the district race
  group_by(wahlkreis, year) %>% 
  mutate(n_parties = sum(vote_share > 0)) %>% 
  ungroup() %>% 
  
  # Calculate the margin of victory/loss
  mutate(margin = case_when(
    party == winner ~ vote_share - second_vote_share,
    party != winner ~ vote_share - winner_vote_share
  )) 




# Individual-level data: GLES ---------------------------------------------

# Load the raw data set
gles_cum <- haven::read_dta("raw_data/ZA6835_v2-0-0.dta")

survey_data <- gles_cum %>% 
  # Remove all labels from dta data set
  sjlabelled::remove_all_labels() %>%
  
  # Exclude 2009 as the dependent variable was not measured in this year
  mutate(year = intyear) %>% 
  filter(year != 2009) %>% 
  
  # Indicator whether interview was conducted before or after election
  mutate(post = ifelse(sample %in% c(4,6,8), 1, 0)) %>% 
  
  # First vote (vote decision if post-election; vote intention if pre-election)
  mutate(erststimme = ifelse(post == 1, v13aa, v9aa)) %>% 
  # Second vote (vote decision if post-election; vote intention if pre-election)
  mutate(zweitstimme = ifelse(post == 1, v13ba, v9ba)) %>% 
  
  # Electoral district
  mutate(wahlkreis = case_when(year == 2013 ~ wknr_13, 
                               year == 2017 ~ wknr_17,
                               year == 2021 ~ wknr_21
                               )
         ) %>% 
  
  # Duplicate variable to analyse missing values
  mutate(erststimme_raw = erststimme,
         zweitstimme_raw = zweitstimme) %>%
  
  # Views on representation
  rename(rep_voters = v130a,
         rep_all = v130b,
         rep_party = v130c,
         rep_nation = v130e) %>% 
  
  # Duplicate variable to analyse missing values
  mutate(rep_voters_raw = rep_voters,
         rep_all_raw = rep_all) %>% 
  
  # Birth year
  mutate(birth_year = case_when(
    d3c == "-99 keine Angabe" ~ NA_real_,
    d3c == "1919 oder frueher" ~ 1919,
    TRUE ~ as_numeric(d3c)
  )) %>% 
  
  # Rename variables
  rename(
    sex = d1,
    education = d5 ,
    interest = v132,
    knowledge = v133,
    weight = w_ipfges_1
  ) %>% 
  
  # Select relevant variables 
  select(
    year,
    post,
    wahlkreis,
    erststimme,
    zweitstimme,
    erststimme_raw,
    zweitstimme_raw,
    rep_voters,
    rep_all,
    rep_party,
    rep_nation,
    rep_voters_raw,
    rep_all_raw,
    birth_year,
    sex,
    education,
    interest,
    knowledge,
    weight
  ) %>% 
  
  # Create "male" variable
  mutate(male = dplyr::recode(sex,
                              `2` = 0,
                              `-99` = NA_real_)) %>% 
  
  
  # Calculate age of respondents
  mutate(age = year - birth_year) %>% 
  
  # Assess whether answer to political knowledge variable was correct
  mutate(knowledge = case_when(knowledge == 2 ~ 1,
                               knowledge %in% c(-98, -73, 1, 3) ~ 0,
                               knowledge == -99 ~ NA_real_
  )) %>% 
  
  # Set missing values to NA for education and interest variable
  mutate(education = dplyr::recode(education, `-99` = NA_real_, `6` = NA_real_, `9` = NA_real_, `-93` = NA_real_, `-73` = NA_real_ ),
         interest = dplyr::recode(interest, `-99` = NA_real_, `-98` = NA_real_, `-73` = NA_real_)
  ) %>% 
  
  # Recode first and second votes (other categories)
  mutate(erststimme = dplyr::recode(erststimme, `206` = 801, `215` = 801),
         zweitstimme = dplyr::recode(zweitstimme, `206` = 801, `215` = 801)) %>% 
  
  # Set Zweitstimme to NA if missing
  mutate(zweitstimme = ifelse(zweitstimme > 0, zweitstimme, NA_real_)) %>%  
  
  # Views on representation: Set missing values to NA
  mutate(rep_voters = dplyr::recode(rep_voters, `-99` = NA_real_, `-98` = NA_real_, `-93` = NA_real_, `-73` = NA_real_),
         rep_all = dplyr::recode(rep_all, `-99` = NA_real_, `-98` = NA_real_, `-93` = NA_real_, `-73` = NA_real_),
         rep_party = dplyr::recode(rep_party, `-99` = NA_real_, `-98` = NA_real_, `-93` = NA_real_, `-73` = NA_real_),
         rep_nation = dplyr::recode(rep_nation, `-99` = NA_real_, `-98` = NA_real_, `-93` = NA_real_, `-73` = NA_real_)
  ) %>% 
  
  
  # Reorder views on representation, so that higher values indicate higher importance
  mutate(
    rep_voters = dplyr::recode(rep_voters,
                               `1` = 5,
                               `2` = 4,
                               `3` = 3,
                               `4` = 2,
                               `5` = 1),
    rep_all = dplyr::recode(rep_all,
                            `1` = 5,
                            `2` = 4,
                            `3` = 3,
                            `4` = 2,
                            `5` = 1),
    rep_party = dplyr::recode(rep_party,
                              `1` = 5,
                              `2` = 4,
                              `3` = 3,
                              `4` = 2,
                              `5` = 1),
    rep_nation = dplyr::recode(rep_nation,
                               `1` = 5,
                               `2` = 4,
                               `3` = 3,
                               `4` = 2,
                               `5` = 1),
    
    # Same for political interest
    interest = dplyr::recode(interest,
                             `1` = 5,
                             `2` = 4,
                             `3` = 3,
                             `4` = 2,
                             `5` = 1)
  ) %>% 
  
  # Relative importance outcome
  mutate(relative_importance = rep_voters - rep_all)  %>% 
  
  # Binary relative importance
  mutate(relative_binary = dplyr::if_else(relative_importance > 0, 1, 0)) %>% 
  
  # Ration relative importance 
  rowwise() %>% 
  mutate(relative_ratio = (rep_voters - rep_all) / max(rep_voters, rep_all)) %>% 
  ungroup() %>% 
  
  # Remove unnecessary variables
  select(-c(sex, birth_year))  











# Merge district- and individual-level data -------------------------------

survey_data <- left_join(survey_data, districts_results, by = c("wahlkreis", "year", "erststimme" = "party")) %>% 
  
  # Create identifier of each election race
  mutate(cluster = paste(year, "_", wahlkreis, sep = "")) %>% 
  
  # Years as factor variable
  mutate(year = as.factor(year)) %>% 
  
  # Reorder variables
  select(
    year,
    wahlkreis,
    cluster,
    post,
    male,
    age,
    education,
    interest,
    knowledge,
    rep_voters,
    rep_all,
    rep_party,
    rep_nation,
    rep_voters_raw,
    rep_all_raw,
    relative_importance,
    relative_binary,
    relative_ratio,
    erststimme,
    erststimme_raw,
    party_name,
    vote_share,
    winner,
    winner_name,
    winner_vote_share,
    second,
    second_name,
    second_vote_share,
    margin,
    n_parties,
    zweitstimme,
    zweitstimme_raw,
    weight
  )


# Write csv data frames to save data
write.csv(districts_results,  file = "data_coded/districts_results.csv", row.names = FALSE)
write.csv(survey_data,  file = "data_coded/survey_data.csv", row.names = FALSE)
